The San Francisco crime dataset from DATASF provides a great opportunity to explore data visually and try to understand the underlying patterns present in it. Since the dataset covers a wide variety of crimes, visualizing them all at once might wash out any patterns contained in smaller subgroups. I have therefore elected to look at a subset of crimes, collected in the set focuscrimes. These categories are analysed both temporally and geographically.
from urllib.request import *
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import datetime
import folium
from folium.plugins import HeatMap, MarkerCluster
import branca.colormap
import seaborn as sns
from collections import defaultdict
sns.set(color_codes=True)
%matplotlib inline
import plotly.io as pio
pio.renderers.default='notebook'
It's time to explore how the crime statistics change over time. Here we look at the temporal development of only a subset of categories, the so-called focus crimes, listed in the cell below.
For this part the patterns are explored at different time granularities (a short sketch of how each granularity is derived from the timestamp follows this list):
Weekly patterns. We forget about the yearly variation and simply count what happens on each weekday. Some things make sense, for example drunkenness and the weekend.
The months. We can also check whether some months are worse than others by counting up the number of crimes in January, February, ..., December.
The 24-hour cycle. We can also ignore the weekday and simply count, across the entire dataset, how often each crime type occurs from midnight to 1 am, from 1 am to 2 am, and so on.
Hours of the week. By looking at just 24 hours we may miss trends modulated by the weekday, so let's also check out the 168 hours of the week: the number of each crime type on Monday from midnight to 1 am, Monday from 1 am to 2 am, all the way to Sunday from 11 pm to midnight.
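Below is a rough sketch (not one of the original cells) of how these four granularities can be derived from a single timestamp column with pandas; ts stands for a datetime Series such as df['Timestamp'], which is created further down.
# Sketch only: `ts` stands for a pandas datetime Series such as df['Timestamp']
weekday = ts.dt.day_name()                      # weekly pattern
month = ts.dt.month                             # monthly pattern
hour = ts.dt.hour                               # 24-hour cycle
hour_of_week = ts.dt.weekday * 24 + ts.dt.hour  # 0-167, hours of the week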
focuscrimes = set(['WEAPON LAWS', 'PROSTITUTION', 'DRIVING UNDER THE INFLUENCE', 'ROBBERY', 'BURGLARY', 'ASSAULT', 'DRUNKENNESS', 'DRUG/NARCOTIC', 'TRESPASS', 'LARCENY/THEFT', 'VANDALISM', 'VEHICLE THEFT', 'STOLEN PROPERTY', 'DISORDERLY CONDUCT'])
df = pd.read_csv("https://data.sfgov.org/api/views/tmnf-yvry/rows.csv?accessType=DOWNLOAD", header='infer')
# Create timestamp for better management of date and time information
df['Timestamp'] = df[['Date', 'Time']].agg('-'.join, axis=1)
df['Timestamp']=pd.to_datetime(df['Timestamp'], format="%m/%d/%Y-%H:%M")
## Filter only by the focuscrimes categories
df1 = df[df["Category"].isin(focuscrimes)]
## Select data from 2013 forward
df1 = df1[df1["Timestamp"].dt.year > 2012]
# The "location" column is simply used for the count() aggregation function after the groupby, then
# this column is renamed to Cases
df_week = df1[["DayOfWeek","Category","location"]].groupby(["Category","DayOfWeek"], as_index=False).count()
df_week.columns = ["Category","DayOfWeek", "Cases"]
def run_vis():
    rows = 7
    cols = 2
    fig = make_subplots(
        rows=rows, cols=cols, subplot_titles=[i for i in focuscrimes],
        vertical_spacing=0.08
    )
    # Explicit weekday order for the x-axis labels of the plots
    categoryarray = ["Monday", "Tuesday", "Wednesday", "Thursday",
                     "Friday", "Saturday", "Sunday"]
    i = 1; j = 0
    for category in focuscrimes:
        j += 1
        section = df_week[df_week["Category"] == category]
        fig.add_trace(go.Bar(x=section["DayOfWeek"], y=section["Cases"]), row=i, col=j)
        if j == 2:  # move to the next subplot row after filling both columns
            j = 0
            i += 1
    fig.update_layout(height=1400, width=800, showlegend=False)
    fig.update_xaxes(categoryorder='array', categoryarray=categoryarray)
    return fig
run_vis()
In the graphs above we can see clear increases in some categories during the weekend. As one would expect, Driving Under the Influence and Assault occur more often on the days when Drunkenness is also more present; in our opinion these three categories are correlated, since physical altercations and drunk driving tend to become more frequent when people are inebriated. It is also interesting that Larceny/Theft and Burglary peak on Saturdays. We are led to think that both are more frequent on those days because more people are out, and therefore exposed to theft, while their empty homes are at greater risk of burglary. The peak in Prostitution on Thursdays is intriguing, though one could speculate that some clients make one last outing before having to spend the weekend with their families.
months = pd.DatetimeIndex(df1["Timestamp"]).month.to_numpy()
df1["Month"] = months
df_month = df1[["Month",'location']].groupby("Month", as_index=False).count()
df_month.columns = ["Month","Cases"]
def run_vis():
    fig = go.Figure()
    fig.add_trace(go.Bar(x=df_month["Month"], y=df_month["Cases"]))
    fig.update_layout(xaxis_title="Month", yaxis_title="Crime cases")
    fig.update_yaxes(type="log")
    # Month is numeric (1-12), so relabel the ticks with the month names
    fig.update_xaxes(tickmode='array', tickvals=list(range(1, 13)),
                     ticktext=["January", "February", "March", "April",
                               "May", "June", "July", "August",
                               "September", "October", "November", "December"])
    fig.update_layout(height=400, width=1000, showlegend=False)
    return fig
run_vis()
Doing some research and reading scientific papers on the question at hand, the overall consensus seems to be that crime goes up with the heat (examples such as the number of European revolutions that broke out during the summer, 167/495 in case you were wondering, are often cited). This does not seem to be the case in the data provided, although it might be interesting to consider how the numbers for individual major crimes change throughout the year (we would bet that Burglary and Theft peak in January, as personal possessions go up with Christmas presents).
hours = pd.DatetimeIndex(df1["Timestamp"]).hour.to_numpy()
df1["Hour"] = hours
df_24hour = df1[["Category", "Hour", "location"]].groupby(["Category", "Hour"], as_index=False).count()
df_24hour.columns = ["Category", "Hour", "Cases"]
def run_vis():
    rows = 7
    cols = 2
    fig = make_subplots(
        rows=rows, cols=cols, subplot_titles=[i for i in focuscrimes],
        vertical_spacing=0.05
    )
    i = 1; j = 0
    for category in focuscrimes:
        j += 1
        section = df_24hour[df_24hour["Category"] == category]
        fig.add_trace(go.Bar(x=section["Hour"], y=section["Cases"]), row=i, col=j)
        if j == 2:
            j = 0
            i += 1
    fig.update_layout(height=1400, width=800, showlegend=False)
    return fig
run_vis()
The 24-hour cycle also reveals some interesting patterns; we examine them in more detail with the hour-of-week view below.
df1["hourofweek"] = (pd.DatetimeIndex(df1["Date"]).weekday*24 )+df1["Hour"].astype(float)
df_weekhour = df1[["Category", "hourofweek", "location"]].groupby(["Category", "hourofweek"], as_index=False).count()
df_weekhour.columns = ["Category", "Hourofweek", "Cases"]
def run_vis():
    rows = 7
    cols = 2
    fig = make_subplots(
        rows=rows, cols=cols, subplot_titles=[i for i in focuscrimes],
        vertical_spacing=0.05,
    )
    i = 1; j = 0
    for category in focuscrimes:
        j += 1
        section = df_weekhour[df_weekhour["Category"] == category]
        fig.add_trace(go.Bar(x=section["Hourofweek"], y=section["Cases"], width=1), row=i, col=j)
        if j == 2:
            j = 0
            i += 1
    fig.update_layout(height=1400, width=1200, showlegend=False)
    return fig
run_vis()
Overall, for the alcohol-related crimes we see roughly what was already apparent in the previous graphs: peaks on weekend evenings. The same holds for Burglary and Robbery (again, more people are away from their homes). It is also interesting to note the similarity between Disorderly Conduct and Trespass, both seemingly very frequent between 7 am and 10 am, and especially on the first three days of the week. Lastly, we notice an enormous peak on Thursdays for Prostitution. We analyse this further in the last part of the exercise; such a large difference is probably due to a mistake in how the infractions were recorded (or in the database itself).
Here we make a jitter plot for a chosen crime type and a suitable time sub-sample, to show how the recorded arrest times are distributed within a single hour.
First, we pick the category with the largest number of cases so that the jitter plot is densely populated.
Selection=(df.groupby(['Category'])
.agg({'Y': 'mean', 'Category': 'count'}))
display(Selection['Category'].sort_values(ascending=False).head(1))
Category
LARCENY/THEFT    477975
Name: Category, dtype: int64
After choosing the category Larceny/Theft, we filter for a suitable time interval: the year 2016, the months July through December, and the hour from 18:00 to 18:59.
df_LT_sample = df.loc[((df.Category == 'LARCENY/THEFT')
&(df['Timestamp'].dt.year == 2016)
&(df['Timestamp'].dt.month > 6)
&(df['Timestamp'].dt.hour == 18)), ['Timestamp']]
plt.figure(figsize = [12, 6])
ax = sns.stripplot( x = df_LT_sample['Timestamp'].dt.minute)
# Add a kernel density estimate on top of the jitter plot to better distinguish the most frequent minutes
ax = sns.kdeplot(data=df_LT_sample['Timestamp'].dt.minute, bw_method=0.1, fill=True, color="b");
In the jitter plot we can see that the density of cases at round times is much higher. This tells us more about how the San Francisco police annotate crime times than about the actual minute-wise distribution of crime.
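To quantify this, here is a quick check (not in the original notebook) of what share of the sampled reports fall on round minutes, assuming df_LT_sample from the cell above is still available.
# Hypothetical check: share of reports logged at round minutes in df_LT_sample
minutes = df_LT_sample['Timestamp'].dt.minute
print(f"Share at :00/:15/:30/:45 -> {minutes.isin([0, 15, 30, 45]).mean():.1%}")
print(f"Share at any multiple of 5 minutes -> {(minutes % 5 == 0).mean():.1%}")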
First, we will observe and understand the GPS data through some histograms. We pick two crime types with different geographical patterns and a suitable time-interval subsample (between 1,000 and 10,000 points per histogram). We then take the latitude of the GPS coordinates of each crime and bin the latitudes into around 50 bins across the city of SF.
One option for finding categories with different geographical patterns would be to plot all the histograms; however, we want to keep this project as compact as possible, so instead we compare the mean latitude of the categories in the chosen size range.
display(Selection['Y'][(Selection['Category']>=1000)&(Selection['Category']<=10000)].sort_values(ascending=False))
Category
DISORDERLY CONDUCT             37.782273
KIDNAPPING                     37.774333
LOITERING                      37.773842
SEX OFFENSES, FORCIBLE         37.772659
EMBEZZLEMENT                   37.770820
DRUNKENNESS                    37.770649
LIQUOR LAWS                    37.767707
SUICIDE                        37.766082
DRIVING UNDER THE INFLUENCE    37.765672
ARSON                          37.757443
RECOVERED VEHICLE              37.754797
Name: Y, dtype: float64
As a result of this exploration we choose to visualize DISORDERLY CONDUCT and RECOVERED VEHICLE, which have the most different mean latitudes among the categories above. Additionally, we noticed that some cases have an erroneous location. We handle this by filtering out latitudes above 40 degrees.
df_hist_sample = df[(df['Category'].isin(['DISORDERLY CONDUCT', 'RECOVERED VEHICLE']))
& (df.Y <40.0)]
# Compute 50-bin histograms of the latitudes with numpy.histogram
hist1, bin_edges1= np.histogram(df_hist_sample[df_hist_sample.Category == 'DISORDERLY CONDUCT'].Y, bins=50)
hist2, bin_edges2= np.histogram(df_hist_sample[df_hist_sample.Category == 'RECOVERED VEHICLE'].Y, bins=50)
# Plot both latitude histograms on shared axes
fig = plt.figure(figsize = [12, 6])
ax = plt.hist(df_hist_sample[df_hist_sample.Category == 'DISORDERLY CONDUCT'].Y, bins = bin_edges1, alpha = 0.5);
ax = plt.hist(df_hist_sample[df_hist_sample.Category == 'RECOVERED VEHICLE'].Y, bins = bin_edges2, alpha = 0.5);
plt.legend(labels = ['DISORDERLY CONDUCT', 'RECOVERED VEHICLE']);
In the graph we can observe that, as we guessed, the chosen categories show different patterns. Both have a high density around 37.78 degrees of latitude, with disorderly conduct counting roughly twice as many cases as recovered vehicles in that area. However, as the latitude decreases, the number of disorderly conduct cases drops abruptly, while the density of recovered vehicles stays more constant.
A new take on geospatial data using Folium.
Now we study geospatial data by plotting raw data points as well as heatmaps on top of actual maps.
"""Generate folium San Francisco base map with Stamen toner tile """
def generateBaseMap(default_location=[37.77919, -122.41914], default_zoom_start=12):
base_map = folium.Map(location=default_location, control_scale=True,
zoom_start=default_zoom_start, tiles="Stamen toner")
return base_map
base_map = generateBaseMap()
The coordinates of SF City Hall are 37.77919, -122.41914. To indicate its location on the map, we add a pop-up enabled marker.
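The cell that adds this marker is not shown above, so the following is a minimal sketch of how such a pop-up marker can be added with folium, reusing the base_map created earlier and the coordinates just quoted.
# Sketch (assumed, not an original cell): pop-up marker for SF City Hall
folium.Marker(location=[37.77919, -122.41914], popup="SF City Hall").add_to(base_map)
base_map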
For the following visualization, we show a clustered scatter plot on top of the map. This time we focus on two months of data for 'DRUG/NARCOTIC' and draw a pop-up marker for each arrest in those two months. When the map is zoomed out, the points cluster together at their centre of mass and show the number of crimes in each cluster; hovering over a cluster highlights its catchment area. When we zoom in, the points become individual markers that can be clicked to display useful information about that specific crime.
# Select DRUG/NARCOTIC category in June-July 2016
df2 = df[(df["Category"]=="DRUG/NARCOTIC")
& (pd.DatetimeIndex(df["Date"]).month.isin([6,7]))
& (pd.DatetimeIndex(df["Date"]).year == 2016)]
""" Displays the code and time of the crime as additional info for the plot markers """
def crime_info(row):
incident_code = str(row["Incident Code"])
date = str(row["Date"])
time = str(row["Time"])
info = "Inc.Code: " + incident_code+" Date: "+date+" "+time
return info
"""Creates array of coordinates to input folium map marker map"""
def to_coordinates(row):
return [row["Y"],row["X"]]
coord = df2.apply(to_coordinates,axis=1).to_numpy().tolist() # Coordinates of the crimes
info = df2.apply(crime_info, axis=1).to_numpy().tolist() # Crime info
base_map = generateBaseMap()
marker_cluster = folium.plugins.MarkerCluster().add_to(base_map) # Generate marker clusters for better visualization
for i in range(len(coord)):
    folium.Marker(location=coord[i], popup=info[i]).add_to(marker_cluster)  # Add a crime marker to the cluster
base_map
Sometimes binning can hide imprecision, irregularity, and plain errors in the data, which can be misleading. In the work we have done so far, we have already come across at least three examples of this in the SF data. These errors become difficult to notice once we aggregate the data, whether by computing means (and statistics more generally) or by binning it for visualization, because they bias the binned versions of the data. We explore this in the final part of the analysis and briefly discuss how failing to notice such errors can lead to misconceptions about the underlying patterns of what is going on in San Francisco (and about our modelling).
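As a toy illustration of how a few bad rows bias an aggregate, recall the erroneous locations we filtered out earlier with the latitude cut; purely for illustration we assume here that the bad rows sit at a latitude of 90.0 degrees and that the counts are made up.
# Toy illustration (made-up counts): a handful of misrecorded latitudes drags
# the mean latitude well away from San Francisco (~37.77)
lats = np.full(1000, 37.77)                     # 1000 correctly recorded crimes
lats_with_errors = np.append(lats, [90.0] * 5)  # 5 misrecorded rows
print(lats.mean(), lats_with_errors.mean())     # 37.77 vs roughly 38.03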
In the hourly activity for PROSTITUTION something surprising is going on on Thursdays. The anomaly in PROSTITUTION cases occurs during one specific hour and drops drastically in the next. While the overall level of PROSTITUTION remains higher on Thursdays than on the other days, this single-hour behaviour is odd and could be caused by wrongly registered outliers in the dataframe, or by specific police operations aimed at stopping this type of crime.
thursdayhour_events = df1[df1["hourofweek"] == 84][["Date", "hourofweek"]].groupby("Date", as_index=False).count()
thursdayhour_events.columns = ["Date", "Cases"]
tonumpy = thursdayhour_events["Cases"].to_numpy()
mu = np.mean(tonumpy)
sig = np.std(tonumpy)
max_day = np.max(tonumpy)
plt.figure(figsize = [12, 6])
plt.hist(tonumpy, bins = 50,
alpha = 0.5, color="green");
plt.xlabel("Number of ocurrences during 84th hour of the week")
print("Mean: %1.2f \t" %mu, "Stand. Dev: %1.2f \t" %sig)
Mean: 12.18 Stand. Dev: 5.50
As we can see in the histogram, the number of PROSTITUTION incidents in the 84th hour of the week normally lies between 6 and 18. Nevertheless, on some days the number of incidents in this particular hour is remarkably high, reaching close to 50 incidents within the same hour.
Let's consider as outliers all counts that deviate more than 2$\sigma$ from the mean ($\mu$), that is, roughly more than 25 or fewer than 1 incidents, and see what happens.
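Concretely, with the mean and standard deviation printed above, the 2$\sigma$ band is $\mu \pm 2\sigma = 12.18 \pm 2 \cdot 5.50 \approx [1.2,\ 23.2]$; the cell below uses the rounded cutoffs of fewer than 1 or more than 25 cases.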
thursdayhour_events_outliers = thursdayhour_events[(thursdayhour_events["Cases"]>25)
| (thursdayhour_events["Cases"]<1)]
outlier_dates = thursdayhour_events_outliers["Date"].to_numpy()
We now repeat the last part of the first exercise, but with these outlier dates removed.
## Filter only by the focuscrimes categories
df1 = df[df["Category"].isin(focuscrimes)]
## Select data from 2013 forward
df1 = df1[df1["Timestamp"].dt.year > 2012]
## Drop outliers
df1.drop(df1[(df1["Date"].isin(outlier_dates)) & (df1["Category"]=="PROSTITUTION")].index, inplace=True)
hours = pd.DatetimeIndex(df1["Timestamp"]).hour.to_numpy()
df1["Hour"] = hours
df1["hourofweek"] = (pd.DatetimeIndex(df1["Date"]).weekday*24 )+df1["Hour"].astype(float)
df_weekhour = df1[["Category", "hourofweek", "location"]].groupby(["Category", "hourofweek"], as_index=False).count()
df_weekhour.columns = ["Category", "Hourofweek", "Cases"]
## Zoom in PROSTITUTION category
section = df_weekhour[df_weekhour["Category"] == "PROSTITUTION"]
fig = go.Figure(go.Bar(x=section["Hourofweek"], y=section["Cases"],
width=1, marker_color="#FFA15A"))
fig.update_layout(xaxis_title="Hour of the week",
yaxis_title = "Cases", height=400, width=1000, title="PROSTITUTION")
fig.show()
Without the outlier events, the number of incidents in the 84th hour of the week has shrunk noticeably. This could mean that on a few days the number of PROSTITUTION incidents suddenly spiked (for unknown reasons) or, alternatively, that these crimes were registered erroneously, in which case treating them as outliers of the distribution is justified.
When we investigated how the timestamps are recorded using jitter plots, we saw that many more crimes were recorded on the hour, at 15 minutes past the hour, and to a lesser extent at whole 10-minute increments; crimes were rarely recorded in between those round numbers. In other words, crime times are rounded to the nearest 15 minutes, most noticeably to the half hour and the exact hour. To cope with this we have to increase the bin width, so that a given peak stays within its bin despite the rounding. This, however, makes it harder to recognise finer patterns in the underlying distribution, so it is important to strike a balance.
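A minimal sketch of this re-binning, assuming df_LT_sample from the jitter-plot cell is still available: group the minutes into 15-minute bins so that reports rounded to :00/:15/:30/:45 fall squarely inside one bin.
# Sketch: 15-minute bins within the hour (0, 1, 2, 3)
minute_bins = df_LT_sample['Timestamp'].dt.minute // 15
minute_bins.value_counts().sort_index().plot(kind='bar')
plt.xlabel("15-minute bin within the hour")
plt.ylabel("Reported LARCENY/THEFT cases")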
The Hall of Justice seemed to be an unlikely hotspot for sex offences. If we plot the heatmap, we can observe a single spot assigned to multiple different crimes.
df2 = df[df["Category"] == "SEX OFFENSES, NON FORCIBLE"]
base_map = generateBaseMap()
# We mark the Hall of Justice with a pop-up marker, using a blue info-sign icon.
folium.Marker(location=[37.775471, -122.403717], popup="Hall of Justice",
              icon=folium.Icon(color="blue", icon="info-sign")).add_to(base_map)
HeatMap(data=df2[['Y', 'X', 'location']].groupby(['Y', 'X']).count().reset_index().values.tolist(),
radius=15, max_zoom=10).add_to(base_map)
base_map
At first we might argue that this is a rounding problem in the Folium plotting itself, but that does not seem to be the case. Let's take a look at the length of the filtered dataframe (df2):
print("Dataframe length:", len(df2), "\nNumber of unique longitudes:", len(df2.X.unique()),
"\tNumber of unique latitudes:", len(df2.X.unique()) )
Dataframe length: 43 
Number of unique longitudes: 40 	Number of unique latitudes: 40
The number of unique longitudes and latitudes is lower than the length of the dataframe, which means that the same coordinates have been assigned to several different crimes. The reason for this flaw in the data is unknown; it could be caused by human error or by rounding when the latitudes and longitudes were registered.
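A quick sketch of how this can be confirmed directly, by counting the rows of df2 that share their (X, Y) pair with at least one other row:
# Sketch: rows whose coordinates also appear on another row of df2
dup_coords = df2.duplicated(subset=['X', 'Y'], keep=False)
print("Rows sharing coordinates with at least one other row:", dup_coords.sum())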